import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

print(__version__) # requires version >= 1.9.0

2.7.0

import cufflinks as cf

# For Notebooks
init_notebook_mode(connected=True)

# For offline use
cf.go_offline()

df = pd.read_csv('WorldCups.csv')
df1 = pd.read_csv('WorldCupMatches.csv')
df2 = pd.read_csv('WorldCupPlayers.csv')

OVERALL WORLD CUP DATAFRAME

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
Year              20 non-null int64
Country           20 non-null object
Winner            20 non-null object
Runners-Up        20 non-null object
Third             20 non-null object
Fourth            20 non-null object
GoalsScored       20 non-null int64
QualifiedTeams    20 non-null int64
MatchesPlayed     20 non-null int64
Attendance        20 non-null object
dtypes: int64(4), object(6)
memory usage: 1.6+ KB

#Exploring data
df.head(20)

df['Attendance'][0:3].astype(float).mean() #Only 0 to 3 work because they can be converted to floats (they only have 1 decimal)

443.083

#We have run into problem of there being periods in the data instead of commas which prevents you from doing anything with
#those numbers. The fix would be a loop through that columns which takes the current value and reads the number, any time
#a period is found, replace it with a comma

#Before writing a lambda expression to change the data, lets see if I can graph it and maybe have better insight
# sns.distplot(df['Attendance'])

#This verified there is a problem with how the original data is formatted. Need to clean this column.

def delete_periods(string):
    string = string.replace(".", "")
    return int(string)
    
df['Attendance'] = df['Attendance'].apply(lambda x: delete_periods(x))

#Testing if all numbers are now valid. Since function works and average makes sense, it is now valid and ready for analysis
df['Attendance'].mean()

1872882.35

#Graphing World Cup Attendance
plt.style.use('seaborn-darkgrid')
graph = df.plot.line(x='Year',y='Attendance',figsize=(16,8),lw=3)
graph.set_title("Number of World Cup Attendees");
graph.set_xlabel("Year")

Text(0.5,0,'Year')

#Insight: Attendance has increased over the years with the most attended world cup being in 1994, no other has been
#as attended since then

#Now I start looking to see which country has won the most

plt.style.use('fivethirtyeight')
graph = df['Winner'].hist(bins=20,figsize=(16,6))
graph.set_title("Number of World Cup Wins");
graph.set_ylabel("Games Won")
graph

<matplotlib.axes._subplots.AxesSubplot at 0x1e2a1c6aa58>

plt.style.use('fivethirtyeight')
graph = df['Runners-Up'].hist(bins=30,figsize=(16,6))
graph.set_title("Number of times World Cup Runners-Up");
graph.set_ylabel("Number of times")
graph

<matplotlib.axes._subplots.AxesSubplot at 0x1e2a1e03ba8>

plt.style.use('seaborn-muted')
graph = df['Third'].hist(bins=35,figsize=(20,6))
graph.set_title("Number of times World Cup Third Place");
graph.set_ylabel("Number of times")
graph

<matplotlib.axes._subplots.AxesSubplot at 0x1e2a1ebe518>

plt.style.use('seaborn-pastel')
graph = df['Fourth'].hist(bins=35,figsize=(25,6))
graph.set_title("Number of times World Cup Fourth Place");
graph.set_ylabel("Number of times")
graph

<matplotlib.axes._subplots.AxesSubplot at 0x1e2a2189438>

#Aggregating top 3 placements
df_first = df['Winner']
df_second = df['Runners-Up']
df_third = df['Third']

frames = [df_first,df_second,df_third]
result = pd.concat(frames)

plt.style.use('seaborn-pastel')
graph = result.hist(bins=40,figsize=(35,10))
graph.set_title("Number of World Cup Top 3 Placements");
graph.set_ylabel("Number of times")
#plt.savefig('img/top3.png', bbox_inches='tight')

Text(0,0.5,'Number of times')

#Graphing Number of Goals Scored
plt.style.use('seaborn-poster')
graph = df.plot.line(x='Year',y='GoalsScored',figsize=(16,8),lw=3)
graph.set_title("Number of Goals Scored");
graph.set_xlabel("Year")

Text(0.5,0,'Year')

#Graphing Changes in World Cup tournament

plt.style.use('seaborn-deep')
plt.figure(figsize=(16,8))
plt.plot( 'Year', 'QualifiedTeams', data=df, linewidth=2,label="Number of Teams")
plt.plot( 'Year', 'MatchesPlayed', data=df, linewidth=2,label="Number of Matches Played")
plt.title("Changes in World Cup");
plt.legend()



#df.plot(x='Year',y='QualifiedTeams')
#df.plot(x='Year',y='MatchesPlayed')
#plt.show()

<matplotlib.legend.Legend at 0x1e2a2602898>

#Insight: On the surface it seems that the number of goals increases as time went on, most likely because we are becoming
#more powerful super humans. Unfortunately that isn't the case and it is explained by the changes to the tournament
#increase of number of teams and matches played

MATCHES DATA FRAME

df1.info() #Matches

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 20 columns):
Year                    852 non-null float64
Datetime                852 non-null object
Stage                   852 non-null object
Stadium                 852 non-null object
City                    852 non-null object
Home Team Name          852 non-null object
Home Team Goals         852 non-null float64
Away Team Goals         852 non-null float64
Away Team Name          852 non-null object
Win conditions          852 non-null object
Attendance              850 non-null float64
Half-time Home Goals    852 non-null float64
Half-time Away Goals    852 non-null float64
Referee                 852 non-null object
Assistant 1             852 non-null object
Assistant 2             852 non-null object
RoundID                 852 non-null float64
MatchID                 852 non-null float64
Home Team Initials      852 non-null object
Away Team Initials      852 non-null object
dtypes: float64(8), object(12)
memory usage: 714.5+ KB

#Question to answer: Whether more goals were scored after half-time?

df1.head()

ht_goals = df1['Home Team Goals'].sum() #home team goals total

at_goals = df1['Away Team Goals'].sum() #away team goals total

htht_goals = df1['Half-time Home Goals'].sum() #before half time home team goals

htat_goals = df1['Half-time Away Goals'].sum() #before half time away team goals

ahtht_goals = ht_goals - htht_goals
ahtht_goals #after half time home team goals

939.0

ahtat_goals = at_goals - htat_goals
ahtat_goals #after half time away team goals

506.0

#Creating a dataframe of the individual metrics

goalframe = pd.DataFrame(data={'Before Half-Time': [htht_goals, htat_goals], 'After Half-Time': [ahtht_goals, ahtat_goals]}, columns=['Before Half-Time','After Half-Time'], index=['Home','Away'])
goalframe

goalframe.iplot(kind='bar', barmode='stack', filename='cufflinks/grouped-bar-chart', title="Total World Cup Goals", yTitle="Goals Scored")

PLAYERS DATAFRAME

df2.info() #Players

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37784 entries, 0 to 37783
Data columns (total 9 columns):
RoundID          37784 non-null int64
MatchID          37784 non-null int64
Team Initials    37784 non-null object
Coach Name       37784 non-null object
Line-up          37784 non-null object
Shirt Number     37784 non-null int64
Player Name      37784 non-null object
Position         4143 non-null object
Event            9069 non-null object
dtypes: int64(3), object(6)
memory usage: 2.6+ MB

df2.head()

	Year	Country	Winner	Runners-Up	Third	Fourth	GoalsScored	QualifiedTeams	MatchesPlayed	Attendance
0	1930	Uruguay	Uruguay	Argentina	USA	Yugoslavia	70	13	18	590.549
1	1934	Italy	Italy	Czechoslovakia	Germany	Austria	70	16	17	363.000
2	1938	France	Italy	Hungary	Brazil	Sweden	84	15	18	375.700
3	1950	Brazil	Uruguay	Brazil	Sweden	Spain	88	13	22	1.045.246
4	1954	Switzerland	Germany FR	Hungary	Austria	Uruguay	140	16	26	768.607
5	1958	Sweden	Brazil	Sweden	France	Germany FR	126	16	35	819.810
6	1962	Chile	Brazil	Czechoslovakia	Chile	Yugoslavia	89	16	32	893.172
7	1966	England	England	Germany FR	Portugal	Soviet Union	89	16	32	1.563.135
8	1970	Mexico	Brazil	Italy	Germany FR	Uruguay	95	16	32	1.603.975
9	1974	Germany	Germany FR	Netherlands	Poland	Brazil	97	16	38	1.865.753
10	1978	Argentina	Argentina	Netherlands	Brazil	Italy	102	16	38	1.545.791
11	1982	Spain	Italy	Germany FR	Poland	France	146	24	52	2.109.723
12	1986	Mexico	Argentina	Germany FR	France	Belgium	132	24	52	2.394.031
13	1990	Italy	Germany FR	Argentina	Italy	England	115	24	52	2.516.215
14	1994	USA	Brazil	Italy	Sweden	Bulgaria	141	24	52	3.587.538
15	1998	France	France	Brazil	Croatia	Netherlands	171	32	64	2.785.100
16	2002	Korea/Japan	Brazil	Germany	Turkey	Korea Republic	161	32	64	2.705.197
17	2006	Germany	Italy	France	Germany	Portugal	147	32	64	3.359.439
18	2010	South Africa	Spain	Netherlands	Germany	Uruguay	145	32	64	3.178.856
19	2014	Brazil	Germany	Argentina	Netherlands	Brazil	171	32	64	3.386.810

	Year	Datetime	Stage	Stadium	City	Home Team Name	Home Team Goals	Away Team Goals	Away Team Name	Attendance	Half-time Home Goals	Referee	Assistant 1	Assistant 2	RoundID	MatchID	Home Team Initials	Away Team Initials
0	1930.0	13 Jul 1930 - 15:00	Group 1	Pocitos	Montevideo	France	4.0	1.0	Mexico	4444.0	3.0	LOMBARDI Domingo (URU)	CRISTOPHE Henry (BEL)	REGO Gilberto (BRA)	201.0	1096.0	FRA	MEX
1	1930.0	13 Jul 1930 - 15:00	Group 4	Parque Central	Montevideo	USA	3.0	0.0	Belgium	18346.0	2.0	MACIAS Jose (ARG)	MATEUCCI Francisco (URU)	WARNKEN Alberto (CHI)	201.0	1090.0	USA	BEL
2	1930.0	14 Jul 1930 - 12:45	Group 2	Parque Central	Montevideo	Yugoslavia	2.0	1.0	Brazil	24059.0	2.0	TEJADA Anibal (URU)	VALLARINO Ricardo (URU)	BALWAY Thomas (FRA)	201.0	1093.0	YUG	BRA
3	1930.0	14 Jul 1930 - 14:50	Group 3	Pocitos	Montevideo	Romania	3.0	1.0	Peru	2549.0	1.0	WARNKEN Alberto (CHI)	LANGENUS Jean (BEL)	MATEUCCI Francisco (URU)	201.0	1098.0	ROU	PER
4	1930.0	15 Jul 1930 - 16:00	Group 1	Parque Central	Montevideo	Argentina	1.0	0.0	France	23409.0	0.0	REGO Gilberto (BRA)	SAUCEDO Ulises (BOL)	RADULESCU Constantin (ROU)	201.0	1085.0	ARG	FRA

	RoundID	MatchID	Team Initials	Coach Name	Line-up	Player Name	Position	Event
0	201	1096	FRA	CAUDRON Raoul (FRA)	S	Alex THEPOT	GK	NaN
1	201	1096	MEX	LUQUE Juan (MEX)	S	Oscar BONFIGLIO	GK	NaN
2	201	1096	FRA	CAUDRON Raoul (FRA)	S	Marcel LANGILLER	NaN	G40'
3	201	1096	MEX	LUQUE Juan (MEX)	S	Juan CARRENO	NaN	G70'
4	201	1096	FRA	CAUDRON Raoul (FRA)	S	Ernest LIBERATI	NaN	NaN

	Before Half-Time	After Half-Time
Home	604.0	939.0
Away	365.0	506.0